On of the most known measures for ranking countries is HDI - Human Development Index. It takes into account: life expectancy at birth (in years), expected years of schooling, mean years of schooling and gross national income per capita. In this report we decided to choose top 24 countries from Latest Human Development Index Ranking and take a closer look at additional information from World Development Indicators in their context. Moreover, we tried to find correlations between Bitcoins’ data from 2009 to 2021 and S.P.Composite dataset. Finally, short-term prediction for Bitcoin Market Price USD was made.
Top 24 countries according to http://hdr.undp.org/en/content/latest-human-development-index-ranking. Because Liechtenstein had many missing values in WDI dataset, we decided to change it to the next top country in the index - France.
hdiTop <- c('Norway', 'Ireland', 'Switzerland', 'Iceland', 'Hong Kong', 'Germany', 'Sweden', 'Netherlands', 'Australia', 'Denmark', 'Finland', 'Singapore', 'United Kingdom', 'New Zealand', 'Belgium', 'Canada', 'United States', 'Austria', 'Israel', 'Japan', 'Slovenia', 'Luxembourg', 'Korea, Rep.', 'Spain', 'France')
During generation of this report we used the following libraries:
libraries
## [1] "dplyr" "tidyr" "ggplot2" "readxl" "imputeTS" "vtable"
## [7] "skimr" "DT" "caret" "gganimate" "ggpubr"
currExchRates <- read.csv("./datasets/CurrencyExchangeRates.csv")
goldPrices <- read.csv("./datasets/Gold prices.csv")
spComposite <- read.csv("./datasets/S&P Composite.csv")
worldDevInd <- read_excel("./datasets/World_Development_Indicators.xlsx", trim_ws=TRUE, na = "..")
worldDevInd <- data.frame(worldDevInd)
btcMetadata <- read.csv("./datasets/BCHAIN_metadata.csv")
btcDiff <- read.csv("./datasets/BCHAIN-DIFF.csv")
btcHrate <- read.csv("./datasets/BCHAIN-HRATE.csv")
btcMkpru <- read.csv("./datasets/BCHAIN-MKPRU.csv")
btcTrvou <- read.csv("./datasets/BCHAIN-TRVOU.csv")
In order to clean the Currency Exchange Rates Dataset we inserted 0’s in Euro currency before ’98 and used special function to fill empty individual places by making mean from adjacent values.
CurrExchRatesClean <- currExchRates %>% select(which(colMeans(is.na(.)) < 0.2))
CurrExchRatesClean <- CurrExchRatesClean %>% mutate(Euro = ifelse(Date < as.Date("1998-10-30", format="%Y-%m-%d"), 0, Euro))
CurrExchRatesClean <- CurrExchRatesClean %>% mutate(Date = as.Date(Date, format= "%Y-%m-%d"))
CurrExchRatesClean <- na_ma(CurrExchRatesClean, k = 1)
st(CurrExchRatesClean)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| Australian.Dollar | 5978 | 0.768 | 0.143 | 0.483 | 0.665 | 0.869 | 1.105 |
| Bahrain.Dinar | 5978 | 0.376 | 0 | 0.376 | 0.376 | 0.376 | 0.376 |
| Brazilian.Real | 5978 | 2.158 | 0.765 | 0.832 | 1.72 | 2.717 | 4.195 |
| Canadian.Dollar | 5978 | 1.266 | 0.187 | 0.917 | 1.086 | 1.403 | 1.613 |
| Colombian.Peso | 5978 | 2082.838 | 589.734 | 833.18 | 1795.37 | 2450.322 | 3434.89 |
| Danish.Krone | 5978 | 6.282 | 0.912 | 4.665 | 5.612 | 6.804 | 9.006 |
| Euro | 5978 | 1.007 | 0.476 | 0 | 0.938 | 1.314 | 1.599 |
| Icelandic.Krona | 5978 | 92.418 | 24.739 | 54.72 | 70.42 | 116.888 | 147.98 |
| Indian.Rupee | 5978 | 48.181 | 9.451 | 31.37 | 43.133 | 52.967 | 68.778 |
| Japanese.Yen | 5978 | 107.974 | 14.065 | 75.86 | 100.765 | 118.395 | 147 |
| Korean.Won | 5978 | 1103.062 | 160.752 | 756 | 1016.675 | 1186.8 | 1964.8 |
| Kuwaiti.Dinar | 5978 | 0.294 | 0.011 | 0.265 | 0.287 | 0.303 | 0.309 |
| Libyan.Dinar | 5978 | 1.507 | 0.61 | 0.525 | 0.662 | 1.932 | 1.932 |
| Malaysian.Ringgit | 5978 | 3.504 | 0.491 | 2.436 | 3.182 | 3.8 | 4.725 |
| Nepalese.Rupee | 5978 | 77.184 | 15.094 | 49.88 | 68.475 | 84.698 | 109.98 |
| New.Zealand.Dollar | 5978 | 0.66 | 0.118 | 0.393 | 0.579 | 0.736 | 0.882 |
| Norwegian.Krone | 5978 | 6.964 | 1.087 | 4.959 | 6.107 | 7.803 | 9.606 |
| Pakistani.Rupee | 5978 | 70.139 | 23.441 | 30.877 | 51.866 | 90.937 | 115.697 |
| Qatar.Riyal | 5978 | 3.64 | 0 | 3.64 | 3.64 | 3.64 | 3.64 |
| Rial.Omani | 5978 | 0.384 | 0 | 0.384 | 0.384 | 0.384 | 0.385 |
| Saudi.Arabian.Riyal | 5978 | 3.749 | 0.002 | 3.745 | 3.745 | 3.75 | 3.75 |
| Singapore.Dollar | 5978 | 1.503 | 0.183 | 1.201 | 1.361 | 1.688 | 1.851 |
| South.African.Rand | 5978 | 8.076 | 2.849 | 3.53 | 6.218 | 9.941 | 16.771 |
| Sri.Lanka.Rupee | 5978 | 102.232 | 28.663 | 49.57 | 77.54 | 126.29 | 157.646 |
| Swedish.Krona | 5978 | 7.731 | 1.084 | 5.843 | 6.832 | 8.376 | 10.995 |
| Swiss.Franc | 5978 | 1.211 | 0.249 | 0.725 | 0.979 | 1.389 | 1.823 |
| Thai.Baht | 5978 | 35.229 | 5.531 | 24.44 | 31.568 | 39.447 | 56.061 |
| Trinidad.And.Tobago.Dollar | 5978 | 6.309 | 0.18 | 5.839 | 6.26 | 6.376 | 6.789 |
| U.A.E..Dirham | 5978 | 3.672 | 0.001 | 3.671 | 3.672 | 3.672 | 3.675 |
| U.K..Pound.Sterling | 5978 | 1.615 | 0.174 | 1.213 | 1.518 | 1.675 | 2.102 |
| U.S..Dollar | 5978 | 1 | 0 | 1 | 1 | 1 | 1 |
goldPricesClean <- goldPrices %>% mutate(
EURO..AM. = ifelse(Date < as.Date("1998-12-31", format="%Y-%m-%d"), 0, EURO..AM.),
EURO..PM. = ifelse(Date < as.Date("1998-12-31", format="%Y-%m-%d"), 0, EURO..PM.),
USD..PM. = ifelse(Date < as.Date("1968-03-14", format="%Y-%m-%d"), 0, USD..PM.),
GBP..PM. = ifelse(Date < as.Date("1968-03-14", format="%Y-%m-%d"), 0, GBP..PM.))
goldPricesClean <- goldPricesClean %>% mutate(Date = as.Date(Date, format= "%Y-%m-%d"))
goldPricesClean <- na_ma(goldPricesClean, k = 1)
st(goldPricesClean)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| USD..AM. | 13585 | 575.166 | 490.165 | 34.77 | 280.5 | 841.75 | 2061.5 |
| USD..PM. | 13585 | 574.853 | 490.136 | 0 | 280.3 | 841.1 | 2067.15 |
| GBP..AM. | 13585 | 370.554 | 353.367 | 14.485 | 177.63 | 453.906 | 1574.37 |
| GBP..PM. | 13585 | 370.381 | 353.32 | 0 | 177.615 | 453.787 | 1569.59 |
| EURO..AM. | 13585 | 337.372 | 477.165 | 0 | 0 | 576.189 | 1743.82 |
| EURO..PM. | 13585 | 337.309 | 477.073 | 0 | 0 | 576.776 | 1743.43 |
worldDevInd$Series.Name<-gsub("\\$","D",worldDevInd$Series.Name)
worldDevIndClean <- worldDevInd %>% select(-Series.Code) %>%
pivot_longer('X1970..YR1970.':'X2020..YR2020.', names_to = "Year", values_to = "Value", values_drop_na = FALSE) %>%
pivot_wider(names_from = "Series.Name", values_from = Value) %>% unchop(everything())
worldDevIndClean <- worldDevIndClean %>%
mutate(Year = substr(Year, 2, 5)) %>%
filter(grepl('', Country.Code)) %>%
relocate(Year, .after = Country.Code)
trial <- as.data.frame(worldDevIndClean)
trial <- worldDevIndClean %>% filter(Country.Name %in% hdiTop)
#trial <- trial %>% filter(Year==2020) %>% arrange(`GDP (current USD)`)
trial <- trial %>% select(which(colMeans(is.na(.)) < 0.10))
trial[is.na(trial)] <- 0
wdiTop <- trial
datatable(skim(wdiTop %>% dplyr::select(-Country.Name, -Country.Code, -Year)) %>% dplyr::select(skim_variable, numeric.mean, numeric.sd, numeric.p0, numeric.p25, numeric.p75, numeric.p100) %>% dplyr::rename(Variable=skim_variable, Mean=numeric.mean, "Std. Dev."=numeric.sd, Min=numeric.p0, "Pctl. 25"=numeric.p25, "Pctl. 75"=numeric.p75, Max=numeric.p100))
spCompositeClean <- spComposite %>%
mutate(Cyclically.Adjusted.PE.Ratio = ifelse(Year < as.Date("1880-12-31", format="%Y-%m-%d"), 0, Cyclically.Adjusted.PE.Ratio)) %>% rename(Date = Year)
spCompositeClean <- spCompositeClean%>% mutate(Date = as.Date(Date, format= "%Y-%m-%d"))
spCompositeClean <- na_ma(spCompositeClean, k = 1)
st(spComposite)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| S.P.Composite | 1810 | 327.968 | 685.029 | 2.73 | 7.903 | 164.4 | 4493.28 |
| Dividend | 1806 | 6.732 | 12.069 | 0.18 | 0.42 | 7.053 | 59.68 |
| Earnings | 1806 | 15.371 | 28.933 | 0.16 | 0.561 | 14.726 | 158.74 |
| CPI | 1810 | 62.392 | 76.26 | 6.28 | 10.2 | 102.275 | 273.983 |
| Long.Interest.Rate | 1810 | 4.504 | 2.306 | 0.62 | 3.171 | 5.139 | 15.32 |
| Real.Price | 1810 | 621.987 | 733.091 | 73.899 | 186.575 | 706.991 | 4477.204 |
| Real.Dividend | 1806 | 17.498 | 11.114 | 5.445 | 9.417 | 22.301 | 63.511 |
| Real.Earnings | 1806 | 34.907 | 29.841 | 4.576 | 14.063 | 43.768 | 159.504 |
| Cyclically.Adjusted.PE.Ratio | 1690 | 17.215 | 7.048 | 4.784 | 11.898 | 20.913 | 44.198 |
btcTrvou <- btcTrvou %>% rename(Trvou = Value)
btcMkpru <- btcMkpru %>% rename(Mkpru = Value)
btcDiff <- btcDiff %>% rename(Diff = Value)
btcHrate <- btcHrate %>% rename(Hrate = Value)
btcAllClean <- btcTrvou %>% left_join(btcMkpru) %>% left_join(btcHrate) %>% left_join(btcDiff)
btcAllClean <- btcAllClean %>% mutate(Date = as.Date(Date, format= "%Y-%m-%d"))
btcAllClean <- na_ma(btcAllClean, k = 1)
st(btcAllClean)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| Trvou | 4661 | 146728050.012 | 330500192.236 | 0 | 194787.108 | 148443297.885 | 5352015515.54 |
| Mkpru | 4661 | 5141.227 | 10990.321 | 0 | 7.2 | 6499.062 | 63554.44 |
| Hrate | 4661 | 26480643.671 | 46431207.364 | 0 | 12.016 | 38267606.403 | 198514005.71 |
| Diff | 4661 | 3667535953756.88 | 6452342784118.23 | 0 | 1689334.405 | 5363678461480 | 25046487590083 |
btcAnalysis <- btcAllClean %>% pivot_longer(cols='Trvou':'Diff', names_to = 'Param', values_to = 'Value')
ggplot(btcAnalysis, aes(x=Date, y=Value, color = as.factor(Param))) + geom_line() + facet_wrap(~Param, scales = "free_y") + theme_minimal() + labs(colour = "Bitcoin's Parameter", title = "Bitcoin's Overview")
GoldAnalysis <- goldPricesClean %>% pivot_longer(cols = 'USD..AM.':'EURO..PM.', names_to = 'Curr', values_to = 'Value')
ggplot(data = GoldAnalysis, aes(x = Date, y = Value, color=as.factor(Curr))) +
geom_point(size=0.1) + facet_wrap(~Curr) + theme_minimal() + labs(color='Currency', title='Gold Prices\' Overview')
ggplot(data = spCompositeClean, aes(x = Date, y = S.P.Composite)) +
geom_line(color='blue') + theme_minimal() + labs(title='S.P.Composite through Years') + transition_reveal(Date)
btcPlusSP <- btcAllClean %>% inner_join(spCompositeClean, on='Date')
## Joining, by = "Date"
btcPlusGold <- btcAllClean %>% inner_join(goldPricesClean, on='Date')
## Joining, by = "Date"
ggscatter(btcAllClean, x = "Mkpru", y = "Hrate",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "Hrate", color = 'plum4')
## `geom_smooth()` using formula 'y ~ x'
ggscatter(btcPlusSP, x = "Mkpru", y = "S.P.Composite",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "S.P.Composite", color='maroon2')
## `geom_smooth()` using formula 'y ~ x'
ggscatter(btcPlusSP, x = "Mkpru", y = "CPI",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "CPI", color='indianred3')
## `geom_smooth()` using formula 'y ~ x'
ggscatter(btcPlusSP, x = "Mkpru", y = "Earnings",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Mkpru", ylab = "Earnings", color='coral2')
## `geom_smooth()` using formula 'y ~ x'
btcYearsSummary <- btcAllClean %>% mutate(Year = format(as.Date(Date, format="%d/%m/%Y"),"%Y")) %>% select(-Date) %>% group_by(Year) %>% dplyr::summarize(MeanDiff = mean(Diff), MeanHrate=mean(Hrate), MeanMkpru=mean(Mkpru), MeanTrvou=mean(Trvou)) %>% filter(Year > 2008 & Year <2021)
btcPlusCountry <- btcYearsSummary %>% inner_join(wdiTop, on='Year')
## Joining, by = "Year"
for (country in hdiTop) {
countryExample <- btcPlusCountry %>% filter(Country.Name == country)
check <- cor(x = countryExample$MeanMkpru, y = countryExample$`GDP per capita (current USD)`, method = "pearson", use = "everything")
if (check < 0.7 | is.na(check)) {next}
print(ggscatter(countryExample, x = "MeanMkpru", y = "GDP per capita (current USD)",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "MeanMkpru", ylab = "GDP per capita (current USD)", color='orchid2', title = country))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
for (country in hdiTop) {
countryExample <- btcPlusCountry %>% filter(Country.Name == country)
check <- cor(x = countryExample$MeanMkpru, y = countryExample$`Urban population growth (annual %)`, method = "pearson", use = "everything")
if (check < 0.7 | is.na(check)) {next}
print(ggscatter(countryExample, x = "MeanMkpru", y = "Urban population growth (annual %)",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "MeanMkpru", ylab = "Urban population growth (annual %)", color='maroon1', title = country))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
set.seed(23)
btcAllClean2 <- btcAllClean %>% select(-Date)
inTraining <- createDataPartition(y = btcAllClean2$Mkpru, p = .70, list = FALSE)
train <- btcAllClean2[inTraining,]
valid_and_test <- btcAllClean2[-inTraining,]
inTesting <- createDataPartition(y = valid_and_test$Mkpru, p = .50, list = FALSE)
valid <- valid_and_test[ -inTesting,]
test <- valid_and_test[ inTesting,]
ctrl <- trainControl(
# powtórzona ocena krzyżowa
method = "repeatedcv",
# liczba podziałów
number = 2,
# liczba powtórzeń
repeats = 5)
fit <- train(Mkpru ~ .,
data = train,
method = "rf",
trControl = ctrl,
# Paramter dla algorytmu uczącego
ntree = 10)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
fit
## Random Forest
##
## 3265 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 5 times)
## Summary of sample sizes: 1633, 1632, 1633, 1632, 1632, 1633, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 2888.579 0.9342471 853.6425
## 3 2783.717 0.9393712 762.8295
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 3.
pred <- predict(fit, newdata = test)
preddf <- data.frame(Pred = pred, Mkpru = test$Mkpru)
#View(preddf)
datatable(preddf)
print(ggscatter(preddf, x = "Mkpru", y = "Pred",
xlab = "Mkpru", ylab = "Predicted Mkpru", color='maroon1') + geom_smooth())
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'